library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.0     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(Seurat)
library(dplyr)
pbmc.data <- Read10X(data.dir = "/Users/baigal/Desktop/vcftools/scRNA/filtered_feature_bc_matrix")

pbmc <- CreateSeuratObject(counts = pbmc.data, min.cells = 3, min.features  = 200, project = "5k_PBMC", assay = "RNA")

dense.size <- object.size(x = as.matrix(x = pbmc.data))
dense.size
## 1350935128 bytes
sparse.size <- object.size(x = pbmc.data)
sparse.size
## 139163168 bytes
pbmc <- CreateSeuratObject(counts = pbmc.data, min.cells = 3, min.features  = 200, project = "5k_PBMC", assay = "RNA")
pbmc
## An object of class Seurat 
## 18791 features across 4962 samples within 1 assay 
## Active assay: RNA (18791 features)

Quality Control

mito.genes <- grep(pattern = "^MT-", x = rownames(pbmc@assays[["RNA"]]), value = TRUE)
percent.mito <- Matrix::colSums(pbmc@assays[["RNA"]][mito.genes, ])/Matrix::colSums(pbmc@assays[["RNA"]])
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito") 

VlnPlot(object = pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3)

par(mfrow = c(1, 2))
FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "percent.mito")

FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

pbmc <- subset(x = pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mito >  -Inf & percent.mito < 0.25 )

Normalizing Data_LogNormalize

pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

Detection of variable genes

pbmc <- FindVariableFeatures(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, nfeatures = 2000)

top10 <- head(VariableFeatures(pbmc), 10)

plot1 <- VariableFeaturePlot(pbmc)
plot1

plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
## When using repel, set xnudge and ynudge to 0 for optimal results
plot2

##Data Scaling

pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nCounts_RNA", "percent.mito"))
## Regressing out nCounts_RNA, percent.mito
## Centering and scaling data matrix

##PCA

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc), verbose = FALSE)
DimPlot(pbmc, reduction = "pca")

pbmc <- JackStraw(pbmc, num.replicate = 100, dims = 50)
pbmc <- ScoreJackStraw(pbmc, dims = 1:50)

JackStrawPlot(pbmc, dims = 1:30, reduction = "pca")
## Warning: Removed 44221 rows containing missing values (geom_point).

ElbowPlot(pbmc, ndims = 50)

Computing the neighborhood

pbmc <- FindNeighbors(pbmc, reduction = "pca", dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.5, algorithm = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4595
## Number of edges: 166042
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8998
## Number of communities: 14
## Elapsed time: 0 seconds
pbmc <- RunTSNE(object = pbmc, dims.use = 1:20)
pbmc <- RunUMAP(pbmc, dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 13:27:53 UMAP embedding parameters a = 0.9922 b = 1.112
## 13:27:53 Read 4595 rows and found 20 numeric columns
## 13:27:53 Using Annoy for neighbor search, n_neighbors = 30
## 13:27:53 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 13:27:54 Writing NN index file to temp file /var/folders/93/dtv9y3wx3_bfwbx4tfjq5q500000gn/T//RtmpHxpLjP/file184ca24e04ea6
## 13:27:54 Searching Annoy index using 1 thread, search_k = 3000
## 13:27:56 Annoy recall = 100%
## 13:27:56 Commencing smooth kNN distance calibration using 1 thread
## 13:27:57 Initializing from normalized Laplacian + noise
## 13:27:57 Commencing optimization for 500 epochs, with 192070 positive edges
## 13:28:06 Optimization finished
DimPlot(object = pbmc, reduction = "tsne")

DimPlot(pbmc, reduction = "umap", label = TRUE)

DimPlot(pbmc, reduction = "pca")

Marker Detection

cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
print(x = head(x = cluster1.markers, n = 5))
##        p_val avg_logFC pct.1 pct.2 p_val_adj
## S100A8     0  4.143277 1.000 0.123         0
## S100A9     0  4.125020 1.000 0.233         0
## LYZ        0  3.202571 1.000 0.395         0
## VCAN       0  2.577781 0.992 0.071         0
## MNDA       0  2.564347 1.000 0.109         0
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
## # A tibble: 28 x 7
## # Groups:   cluster [14]
##        p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene   
##        <dbl>     <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>  
##  1 0.            0.943 0.856 0.227 0.        0       CCR7   
##  2 0.            0.889 0.802 0.195 0.        0       TRABD2A
##  3 0.            4.14  1     0.123 0.        1       S100A8 
##  4 0.            4.13  1     0.233 0.        1       S100A9 
##  5 1.25e-224     1.26  0.925 0.43  2.35e-220 2       IL7R   
##  6 2.46e-198     1.23  0.644 0.146 4.62e-194 2       KLRB1  
##  7 1.29e-269     2.00  0.988 0.209 2.42e-265 3       CCL5   
##  8 1.20e-164     1.61  0.576 0.085 2.26e-160 3       GZMK   
##  9 0.            3.46  0.993 0.119 0.        4       GNLY   
## 10 1.98e-288     2.90  1     0.224 3.72e-284 4       NKG7   
## # … with 18 more rows
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
DoHeatmap(object = pbmc, features = top10$gene, label = TRUE)
## Warning in DoHeatmap(object = pbmc, features = top10$gene, label = TRUE): The
## following features were omitted as they were not found in the scale.data slot
## for the RNA assay: NUCB2, LUC7L3, CSKMT, INTS6, NKTR, HEXIM1, MT-ND5, MALAT1,
## MT-CYB, NOSIP, SARAF, TCF7, LDHB

Assign annotation for differenct celltypes (ref: bmc_10k_v3.rds)

##read the reference data
pbmc.10k<- readRDS("/Users/baigal/Desktop/vcftools/scRNA/pbmc10k/pbmc_10k_v3.rds")

p1<- DimPlot(pbmc.10k, group.by = "celltype", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("10k pbmc")

p2<- DimPlot(pbmc, group.by = "seurat_clusters", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("5k pbmc")

CombinePlots(plots = list(p1, p2))
## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.

transfer.anchors <- FindTransferAnchors(reference = pbmc.10k, query = pbmc, features = VariableFeatures(object = pbmc.10k), reference.assay = "RNA", query.assay = "RNA", reduction = "pcaproject")
## Warning: Adding a Graph without an assay associated with it
## Warning: Adding a Graph without an assay associated with it

## Warning: Adding a Graph without an assay associated with it

## Warning: Adding a Graph without an assay associated with it
## Performing PCA on the provided reference using 3000 features as input.
## Projecting PCA
## Finding neighborhoods
## Finding anchors
##  Found 7790 anchors
## Filtering anchors
##  Retained 5062 anchors
## Extracting within-dataset neighbors
celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = pbmc.10k$celltype, 
    dims = 1:30)
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
pbmc<- AddMetaData(pbmc, metadata = celltype.predictions)
DimPlot(pbmc, group.by = "predicted.id", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("5k pbmc")